//  Date:         	08/07/2018
//  task:         	education analysis
//  project:      	World Development
//  user-written: 	sutex, sxpose, center, findname, regsave, renvars (plus graph schemes plottig and lean1)

// Reset the session: close any log left open, then clear data, programs and macros.
capture log close
set more off
clear all
macro drop _all
set matsize 800

// Make plottig the default graph scheme; -permanently- also stores the setting
// for future Stata sessions. plottig is user-written and must be installed first.
set scheme plottig, permanently

//  #0:  setup
**************************
// Execute make_index_gr.do (presumably defines the make_index_gr command
// called in section #11 -- confirm against that file).
do make_index_gr

// Text log of everything that follows; overwritten on each run.
log using Prepare_school, text replace


//  #1: load data
****************************************************
use ubridge_schools.dta

// Drop quasi-control schools (rows with a system-missing cluster_id).
drop if cluster_id == .


//  #2: drop variables that have all missing values and clearly unnecessary
****************************************************
// findname (user-written, not installed by default) leaves in r(varlist)
// the variables whose values are missing in every observation.
findname, all(missing(@))
local allmissing `r(varlist)'
drop `allmissing'
drop treat_original NEAR_FID quasicontrol_school

// Place these four variables, in this order, immediately before V_deo_visit_rec.
order ORIG_FID NEAR_DIST NEAR_X NEAR_Y, before(V_deo_visit_rec)

// #3: rename variables too long to store in regression
****************************************************
rename V_teacher_absent_avg Teacher_Absent

// Rename the recoded BE_q9_q14 item and reverse its sign.
rename BE_q9_q14_rec V_teachers_outside
replace V_teachers_outside = -V_teachers_outside

forvalues k = 1/3 {
	rename V_students_per_supply`k' students_supply`k'
}

// School type (q23): give every row of a school the within-school mean,
// then code schools that still have no type as 3.
rename q23 schooltype
bysort schoolid: egen schooltype_mean = mean(schooltype)
replace schooltype = schooltype_mean
drop schooltype_mean
replace schooltype = 3 if schooltype == .

// Distribution of school type within and across treatment arms
bysort treat: tabulate schooltype, missing
tabulate schooltype treat, chi2 exact

// 0/1 indicator per school-type code, then a t-test of each by treatment arm
generate cath    = (schooltype == 2)
generate cou     = (schooltype == 3)
generate noaffil = (schooltype == 1)

foreach dummy in cath cou noaffil {
	ttest `dummy', by(treat)
}


	
// #4: select vars for analysis
****************************************************
// Variable groups reused by the plotting, reshape and index sections below.
local monitoring V_deo_ever_call V_insp_calls V_deo_visit_rec V_isp_visit_rec A_Q3_insprep
local effort Teacher_Absent V_present_teach_e V_perc_alotwritten V_perc_Engaged V_school_staff_meet
local input V_n_teachers V_teach_transf_to students_supply1 students_supply2 students_supply3
local outcome A_enrollment A_PLE_Grade1rate A_PLE_Grade2rate A_PLE_passrate
local covs DataSetID CM_YN DM_YN RV_YN HC_YN PS_YN SCHOOL pop_lc1 adult_pop age_lc1 largest_eth_gp_share ///
lugbara_share HHI_Ethnicity HHI_Religion literacy_lc1 age_lc1 employed_share_lc1 secondary_lc1 ///
Nonagriculture_share_lc1 sc_frac sc_pol HC_dist Arua_dist poverty_census_sc education_sc ///
Message_ct RelevantMess_ct schooltype cath cou noaffil


// #5: keep only vars for analysis
****************************************************
keep `monitoring' `effort' `input' `outcome' `covs' treat schoolid V_audit_period cluster_id

* monitoring
label variable V_deo_visit_rec "DEO visits"
label variable V_deo_ever_call "DEO calls"
label variable V_isp_visit_rec "Inspector visits"
label variable V_insp_calls "Inspector calls"
label variable A_Q3_insprep "Inspector reports"

* effort
* NOTE(review): Teacher_Absent is labelled "% Teachers present" -- confirm the
* variable's direction matches this label.
label variable Teacher_Absent "% Teachers present (records)"
label variable V_present_teach_e "% Teachers present (observed)"
label variable V_perc_alotwritten "Meaningful board"
label variable V_perc_Engaged "Teacher engaged"
label variable V_school_staff_meet "Staff meetings"

* inputs
label variable V_n_teachers "N. teachers employed"
label variable V_teach_transf_to "Teachers transferred to school"
label variable students_supply1 "Students per uniform"
label variable students_supply2 "Students per book"
label variable students_supply3 "Students per pencil"

* outcome
label variable A_enrollment "Enrollment"
label variable A_PLE_Grade1rate "% PLE Grade 1"
label variable A_PLE_Grade2rate "% PLE Grade 2"
label variable A_PLE_passrate "PLE pass rate"

// Remember each variable's label in local l<varname> (falling back to the
// variable name when it has no label) so graph titles and the post-reshape
// labels can reuse it.
foreach v in `monitoring' `effort' `input' `outcome' {
	local l`v' : variable label `v'
	if `"`l`v''"' == "" local l`v' "`v'"
}

* value labels for treatment arm and audit period
label variable treat "Treatment"
label define treat 0 "Control" 1 "Treatment"
label values treat treat

label define period 1 "Base" 2 "mid" 3 "end", modify
label values V_audit_period period

*************************************************************
// #6: plot raw outcomes overtime
*************************************************************

graph drop _all

// One bar chart per variable (graph bar's default statistic, the mean), by
// audit period within treatment arm. Each graph is kept in memory under the
// variable's own name so it can be combined below.
foreach v in `monitoring' `effort' `input' {
	graph bar `v', over(V_audit_period) over(treat) ///
		ytitle(Mean `l`v'') title(`l`v'') name(`v')
}

*************************************************************
* FIGURES 7, 8 and 9 in Supplementary Information
* (monitoring, effort and input panels, in that order)
* lean1 is a user-written scheme.
*************************************************************
foreach grp in monitoring effort input {
	graph combine ``grp'', scheme(lean1)
}

		
*********************************************************
// #7: produce summary statistics for outcome variables
*********************************************************

*********************************************************
* Table 1 in Supplementary Information
*********************************************************

// Same variables, in the same order, as the four macro lists from section #4.
summarize `monitoring' `effort' `input' `outcome'

*********************************************************
// #8: reshape data
*********************************************************

// One row per school; each analysis variable gets a suffix 1, 2 or 3 taken
// from V_audit_period (labelled Base / mid / end in section #5).
reshape wide `monitoring' `effort' `input' `outcome', i(schoolid) j(V_audit_period)

// Re-attach the labels captured in local l<varname> before the reshape.
// Fix: the period-3 label previously read "(endine)".
foreach v in `monitoring' `effort' `input' `outcome' {
	label variable `v'1 "`l`v'' (baseline)"
	label variable `v'2 "`l`v'' (midline)"
	label variable `v'3 "`l`v'' (endline)"
}

*********************************************************
// #9: table for balance of outcomes variable in baseline
*********************************************************

local monitoring V_deo_ever_call V_insp_calls V_deo_visit_rec V_isp_visit_rec A_Q3_insprep
local effort Teacher_Absent V_present_teach_e V_perc_alotwritten V_perc_Engaged V_school_staff_meet
local input V_n_teachers V_teach_transf_to students_supply1 students_supply2 students_supply3
local outcome A_enrollment A_PLE_Grade1rate A_PLE_Grade2rate A_PLE_passrate

tempfile balance

// regsave is not installed in Stata by default

// Regress each baseline (suffix 1) variable on treat with standard errors
// clustered on cluster_id, and store the result as one regsave table column
// per variable. `mode' is empty on the first pass, when the file is created,
// and "append" on every later pass.
foreach dep in `monitoring' `effort' `input' `outcome' {
	regress `dep'1 treat, cluster(cluster_id)
	local deplab : variable label `dep'1
	regsave using "`balance'", ///
		addlabel(Variable,"`deplab'") pval ///
		table(`dep', parentheses(stderr) format(%8.2fc)) `mode'
	local mode "append"
}

preserve
use "`balance'", clear

// Tidy the row identifiers: strip "_coef", blank out the standard-error rows.
replace var = subinstr(var, "_coef", "", 1)
replace var = "" if strpos(var, "stderr") != 0

// sxpose is not installed in Stata by default

* transpose the data
sxpose, clear

* dropping constant sd
* (columns are addressed by position; this depends on the row order regsave
*  produced above)
drop _var5 _var6 _var8 _var2
order _var9 _var4 _var1 _var3 _var7

// The first row holds the former row identifiers: remove a leading underscore
// from any cell that has one, use row 1 as the column names, then drop it.
unab cols : _var9 - _var7
foreach col in `cols' {
	replace `col' = substr(`col', 2, .) if regexm(`col', "^_")
	local newname = `col'[1]
	rename `col' `newname'
}
drop in 1

label variable cons "C mean"
label variable treat "T-C"
label variable treat_pval "P-val"
label variable N "N"

**********************************************************************
* Table 5 in Supplementary Information
**********************************************************************

list, sep(5)

restore

*********************************************************
// #10: define covariates
*********************************************************
// Both distances are divided by 1000 and labelled in km.
replace HC_dist = HC_dist/1000
label variable HC_dist "Distance to health center (km)"

replace Arua_dist = Arua_dist/1000
label variable Arua_dist "Distance to Arua (km)"

generate ldist = log(Arua_dist)
label variable ldist "Log distance to Arua"
summarize Arua_dist ldist, detail

// Strip the _sc and _lc1 suffixes from variable names.
// renvars is not installed in Stata by default.
renvars *_sc,  subst(_sc )
renvars *_lc1, subst(_lc1 )

rename sc_frac fractionalization
rename sc_pol  polarization
label variable fractionalization "Religious fractionalization"

generate lpop = log(adult_pop)
label variable lpop "Log adult population"
summarize adult_pop lpop, detail

// For each control: flag system-missing values in <var>_miss, fill them with
// the sample median, then run -center, standardize- on the filled variable.
// center is not installed in Stata by default.
local controls lpop age poverty_census lugbara_share HHI_Ethnicity polarization HHI_Religion literacy education employed_share Nonagriculture_share Arua_dist
foreach x of local controls {
	generate `x'_miss = (`x' == .)
	tabulate `x'_miss
	egen `x'_median = median(`x')
	replace `x' = `x'_median if `x' == .
	center `x', standardize
	drop `x'_median
}

*********************************************************
// #11: Construct education indices
*********************************************************

// Monitoring components for each audit period (1, 2, 3).
forvalues i=1(1)3{
	gl monitoring`i' V_deo_ever_call`i' V_insp_calls`i' V_deo_visit_rec`i' V_isp_visit_rec`i' A_Q3_insprep`i'
}

// For every component:
//   c_<var>  = z-score using the control-group (treat==0) mean and sd;
//              missing values become the treated-group mean of c_<var> for
//              treated schools and 0 for control schools.
//   <var>    = raw variable, with missing values replaced by the mean of the
//              school's own treatment arm.
// Fix: the conditions used to read "treat==1 & x==. | var==.d". Because & binds
// tighter than |, the .d test ignored the treatment arm, so treated schools
// coded .d ended up with c_<var> = 0 instead of the treated mean. The
// parentheses make the arm restriction apply to both kinds of missing value.
foreach var in $monitoring1 $monitoring2 $monitoring3{
	quietly summarize `var' if treat==0
	local `var'_mean= r(mean)
	local `var'_sd= r(sd)
	gen c_`var' = (`var'-``var'_mean')/``var'_sd'
	qui egen mean_std_`var'=mean(c_`var') if treat==1
	replace c_`var' = mean_std_`var' if treat==1 & (c_`var'==. | `var'==.d)
	replace c_`var' = 0 if treat==0 & (c_`var'==. | `var'==.d)
	qui egen mean_`var'1=mean(`var') if treat==1
	replace `var' = mean_`var'1 if treat==1 & (`var'==. | `var'==.d)
	qui egen mean_`var'0=mean(`var') if treat==0
	replace `var' = mean_`var'0 if treat==0 & (`var'==. | `var'==.d)
	corr `var' c_`var'
	}

* Weighted index (based on Anderson 2008)
* wgt and stdgroup are passed to make_index_gr: unit weights, and the control
* group as the standardization group.
gen wgt	=1
gen stdgroup=treat==0

// Per period: correlations and Cronbach's alpha over all five standardized
// items, an equally weighted mean index (monitoring_index) and the
// make_index_gr index (referenced below as index_edu_monitoring).
// NOTE(review): both indices leave out A_Q3_insprep although alpha includes
// it -- presumably deliberate; confirm.
forvalues i=1(1)3{
	corr  c_V_deo_ever_call`i' c_V_insp_calls`i' c_V_deo_visit_rec`i' c_V_isp_visit_rec`i' c_A_Q3_insprep`i'
	alpha c_V_deo_ever_call`i' c_V_insp_calls`i' c_V_deo_visit_rec`i' c_V_isp_visit_rec`i' c_A_Q3_insprep`i', std item
	egen monitoring_index`i'=rowmean( c_V_deo_ever_call`i' c_V_insp_calls`i' c_V_deo_visit_rec`i' c_V_isp_visit_rec`i')
	make_index_gr edu_monitoring`i' wgt stdgroup V_deo_ever_call`i' V_insp_calls`i'   V_deo_visit_rec`i'   V_isp_visit_rec`i'
	corr monitoring_index`i' index_edu_monitoring`i'
	}

*****************
*education effort
*****************

// Effort components for each audit period (1, 2, 3).
forvalues i=1(1)3{
	gl effort`i' Teacher_Absent`i' V_present_teach_e`i' V_perc_alotwritten`i' V_perc_Engaged`i' V_school_staff_meet`i'
}

// Standardize against the control group and fill missing values by treatment
// arm: c_<var> gets the treated mean of c_<var> (treated) or 0 (control);
// the raw variable gets the mean of its own arm.
// Fix: the conditions used to read "treat==1 & x==. | var==.d". Because & binds
// tighter than |, the .d test ignored the treatment arm, so treated schools
// coded .d ended up with c_<var> = 0. Parentheses restore the arm restriction.
foreach var in $effort1 $effort2 $effort3{
	quietly summarize `var' if treat==0
	local `var'_mean= r(mean)
	local `var'_sd= r(sd)
	gen c_`var' = (`var'-``var'_mean')/``var'_sd'
	qui egen mean_std_`var'=mean(c_`var') if treat==1
	replace c_`var' = mean_std_`var' if treat==1 & (c_`var'==. | `var'==.d)
	replace c_`var' = 0 if treat==0 & (c_`var'==. | `var'==.d)
	qui egen mean_`var'1=mean(`var') if treat==1
	replace `var' = mean_`var'1 if treat==1 & (`var'==. | `var'==.d)
	qui egen mean_`var'0=mean(`var') if treat==0
	replace `var' = mean_`var'0 if treat==0 & (`var'==. | `var'==.d)
	*corr `var' c_`var'
	}

* index using Anderson (2008)
* The period-2 index is built without V_school_staff_meet2; the Kling index
* for period 2 further down omits it as well.
	make_index_gr edu_effort1 wgt stdgroup Teacher_Absent1 V_present_teach_e1 V_perc_alotwritten1 V_perc_Engaged1 V_school_staff_meet1
	make_index_gr edu_effort2 wgt stdgroup Teacher_Absent2 V_present_teach_e2 V_perc_alotwritten2 V_perc_Engaged2
	make_index_gr edu_effort3 wgt stdgroup Teacher_Absent3 V_present_teach_e3 V_perc_alotwritten3 V_perc_Engaged3 V_school_staff_meet3

* Index using Kling et al., (2007)
* Periods 1 and 3 use all five standardized items.
forvalues i=1(2)3{
	corr c_Teacher_Absent`i' c_V_present_teach_e`i' c_V_perc_alotwritten`i' c_V_perc_Engaged`i' c_V_school_staff_meet`i'
	alpha c_Teacher_Absent`i' c_V_present_teach_e`i' c_V_perc_alotwritten`i' c_V_perc_Engaged`i' c_V_school_staff_meet`i', std item
	egen effort_index`i'=rowmean(c_Teacher_Absent`i' c_V_present_teach_e`i' c_V_perc_alotwritten`i' c_V_perc_Engaged`i' c_V_school_staff_meet`i')
	corr effort_index`i' index_edu_effort`i'
	}

* Period 2: four items only (no staff meetings)
	egen effort_index2=rowmean(c_Teacher_Absent2 c_V_present_teach_e2 c_V_perc_alotwritten2 c_V_perc_Engaged2)
	corr effort_index2 index_edu_effort2
	
******************
* education inputs
******************

// Input components for each audit period (1, 2, 3). The trailing digit is the
// period, e.g. students_supply23 = students_supply2 at period 3.
forvalues i=1(1)3{
	gl inputs`i' V_n_teachers`i' V_teach_transf_to`i' students_supply1`i' students_supply2`i' students_supply3`i'
}

// Standardize against the control group and fill missing values by treatment
// arm: c_<var> gets the treated mean of c_<var> (treated) or 0 (control);
// the raw variable gets the mean of its own arm.
// Fix: the conditions used to read "treat==1 & x==. | var==.d". Because & binds
// tighter than |, the .d test ignored the treatment arm, so treated schools
// coded .d ended up with c_<var> = 0. Parentheses restore the arm restriction.
foreach var in $inputs1 $inputs2 $inputs3{
	quietly summarize `var' if treat==0
	local `var'_mean= r(mean)
	local `var'_sd= r(sd)
	gen c_`var' = (`var'-``var'_mean')/``var'_sd'
	qui egen mean_std_`var'=mean(c_`var') if treat==1
	replace c_`var' = mean_std_`var' if treat==1 & (c_`var'==. | `var'==.d)
	replace c_`var' = 0 if treat==0 & (c_`var'==. | `var'==.d)
	qui egen mean_`var'1=mean(`var') if treat==1
	replace `var' = mean_`var'1 if treat==1 & (`var'==. | `var'==.d)
	qui egen mean_`var'0=mean(`var') if treat==0
	replace `var' = mean_`var'0 if treat==0 & (`var'==. | `var'==.d)
	corr `var' c_`var'
	}

* index using Anderson (2008)
* Fix: the period-3 call used students_supply22 (a period-2 variable) where
* students_supply23 belongs. Building each call from the per-period global
* keeps the three lists in step.
forvalues i=1(1)3{
	make_index_gr edu_input`i' wgt stdgroup ${inputs`i'}
}

* Index using Kling et al., (2007)
* Fix: alpha previously mixed the raw V_teach_transf_to with standardized
* items; it now uses c_V_teach_transf_to like the rowmean below.
forvalues i=1(1)3{
	alpha c_V_n_teachers`i' c_V_teach_transf_to`i' c_students_supply1`i' c_students_supply2`i' c_students_supply3`i', std item
	egen input_index`i'=rowmean(c_V_n_teachers`i' c_V_teach_transf_to`i' c_students_supply1`i' c_students_supply2`i' c_students_supply3`i')
	corr input_index`i' index_edu_input`i'
	}
 
******************
*education outcome
******************

// Outcome components for each audit period (1, 2, 3).
forvalues i=1(1)3{
	gl outcome`i' A_enrollment`i' A_PLE_Grade1rate`i' A_PLE_Grade2rate`i' A_PLE_passrate`i'
	}

// Standardize against the control group and fill system-missing values by
// treatment arm: c_<var> gets the treated mean of c_<var> (treated) or 0
// (control); the raw variable gets the mean of its own arm.
foreach var in $outcome1 $outcome2 $outcome3{
	quietly summarize `var' if treat==0
	local `var'_mean= r(mean)
	local `var'_sd= r(sd)
	gen c_`var' = (`var'-``var'_mean')/``var'_sd'
	qui egen mean_std_`var'=mean(c_`var') if treat==1
	replace c_`var' = mean_std_`var' if treat==1 & c_`var'==.
	replace c_`var' = 0 if treat==0 & c_`var'==.
	qui egen mean_`var'1=mean(`var') if treat==1
	replace `var' = mean_`var'1 if treat==1 & `var'==.
	qui egen mean_`var'0=mean(`var') if treat==0
	replace `var' = mean_`var'0 if treat==0 & `var'==.
	}

// Indices are built for periods 1 and 2 only.
// Fix: make_index_gr was called without the "wgt stdgroup" arguments that
// every other call in this file passes after the index name, so the first two
// outcome variables were taken as the weight and the standardization group.
forvalues i=1/2{
	alpha c_A_enrollment`i'  c_A_PLE_Grade1rate`i' c_A_PLE_Grade2rate`i' c_A_PLE_passrate`i', std item
	egen outcome_index`i'=rowmean(c_A_enrollment`i' c_A_PLE_Grade1rate`i' c_A_PLE_Grade2rate`i' c_A_PLE_passrate`i')
	make_index_gr edu_outcome`i' wgt stdgroup A_enrollment`i' A_PLE_Grade1rate`i' A_PLE_Grade2rate`i' A_PLE_passrate`i'
	}
	
*********************************************************
// #12: correlate education indices and messages
*********************************************************

// Bring in the per-topic message counts keyed on DataSetID; rows that exist
// only in the using file are discarded.
merge m:1 DataSetID using ubridge_villages.dta, keepus(topicEducation_n topicHealth_n topicWater_n)
drop if _m==2
drop _m

// Strip the _n suffix (topicEducation_n -> topicEducation, ...).
renvars *_n,   subst(_n )

// Treat a missing message count as zero messages.
foreach var in topicEducation topicHealth topicWater Message_ct RelevantMess_ct{
	replace `var'=0 if `var'==.
	}

**********************************************************************
* relation between distance and messages at the village-level
**********************************************************************
ren RelevantMess_ct topicTotal

**********************************************************************
* Figure 18 in Supplementary Information
**********************************************************************

// Fix: the x-axis title said "(log)" although the plotted variable is
// Arua_dist, which section #10 rescales to km; the title now states km.
// NOTE(review): the sample filter is on ldist>0 -- if a log-scale figure was
// intended, plot ldist instead and restore the "(log)" title.
foreach	y in Education Health Water Total{
	scatter topic`y' Arua_dist if ldist>0 & treat==1 || lowess topic`y' Arua_dist if ldist>0 & treat==1,  lw(vthick) title("`y' messaging") ytitle("`y' messages (cluster)") xtitle("Distance to Arua (km)") legend(off)  name(topic`y')
	}

	graph combine topicTotal topicEducation topicHealth topicWater, title("Messaging and distance to district HQs")

* merge-in number of messages sent
* NOTE(review): Message_ct and topicEducation already exist in the master at
* this point, and merge without -update- keeps the master's values, so only
* RelevantMess_ct (renamed to topicTotal above) is actually brought in.
* Confirm that this is intended.
* Fix: rows found only in the using file are now dropped, as after the first
* merge, so the saved school-level file cannot gain rows without a school.
merge m:1 cluster_id using ubridge_clusters.dta, keepus(RelevantMess_ct Message_ct topicEducation)
	drop if _m==2
	drop _m

**********************************************************************
* change from baseline to midline against number of messages sent
* 1. using Anderson weighted indices
* 2. using Kling unweighted indices
**********************************************************************

label variable RelevantMess_ct "All relevant messages (cluster)"
label variable topicEducation "Education messages (cluster)"

// 1. Anderson indices: period-2 minus period-1 index, treated schools only.
//    Graphs <domain>1 use all relevant messages, graphs <domain>2 use
//    education messages (restricted to topicEducation<11).
foreach y in monitoring effort input outcome {
	generate D`y' = index_edu_`y'2 - index_edu_`y'1

	scatter D`y' RelevantMess_ct if treat==1 ///
		|| lowess D`y' RelevantMess_ct if treat==1, lw(vthick) ///
		title("School `y'") xtitle("All relevant messages") ///
		ytitle("Change `y'") name(`y'1) legend(off)

	scatter D`y' topicEducation if treat==1 & topicEducation<11 ///
		|| lowess D`y' topicEducation if treat==1 & topicEducation<11, lw(vthick) ///
		title("School `y'") xtitle("Education messages") ///
		ytitle("Change `y'") name(`y'2) legend(off)
}

graph combine monitoring1 effort1 input1 monitoring2 effort2 input2

**********************************************************************
* Figure 14 in Supplementary Information
**********************************************************************

// 2. Kling indices: same two plots, stored as graphs <domain>3 and <domain>4.
foreach y in monitoring effort input outcome {
	generate D2`y' = `y'_index2 - `y'_index1

	scatter D2`y' RelevantMess_ct if treat==1 ///
		|| lowess D2`y' RelevantMess_ct if treat==1, lw(vthick) ///
		title("School `y'") xtitle("All relevant messages") ///
		ytitle("Change `y'") name(`y'3) legend(off)

	scatter D2`y' topicEducation if treat==1 & topicEducation<11 ///
		|| lowess D2`y' topicEducation if treat==1 & topicEducation<11, lw(vthick) ///
		title("School `y'") xtitle("Education messages") ///
		ytitle("Change `y'") name(`y'4) legend(off)
}

graph combine monitoring3 effort3 input3 monitoring4 effort4 input4

**********************************************************************
* baseline outcome against number of messages sent
* 1. using Anderson weighted indices
* 2. using Kling unweighted indices
**********************************************************************

// 1. Anderson baseline index against education messages (graphs <domain>5)
foreach y in monitoring effort input outcome {
	scatter index_edu_`y'1 topicEducation if treat==1 & topicEducation<11 ///
		|| lowess index_edu_`y'1 topicEducation if treat==1 & topicEducation<11, lw(vthick) ///
		title("School `y'") xtitle("Education messages") ///
		ytitle(baseline edu `y') legend(off) name(`y'5)
}

graph combine monitoring5 effort5 input5 outcome5, title("Messaging and baseline conditions")

**********************************************************************
* Figure 16 in Supplementary Information
**********************************************************************

// 2. Kling baseline index against education messages (graphs <domain>6)
foreach y in monitoring effort input outcome {
	scatter `y'_index1 topicEducation if treat==1 & topicEducation<11 ///
		|| lowess `y'_index1 topicEducation if treat==1 & topicEducation<11, lw(vthick) ///
		title("School `y'") xtitle("Education messages") ///
		ytitle(baseline edu `y') legend(off) name(`y'6)
}

graph combine monitoring6 effort6 input6 outcome6, title("Messaging and baseline conditions")
	
*********************************************************
// #13: save
*********************************************************
// Attach a dataset note and label, refresh the data signature, and write the
// file in an older .dta format via saveold.
// NOTE(review): the label and file name say "long", but the data were
// reshaped wide in section #8 -- confirm the naming is intended.
notes _dta: ubridge_schools_long.dta
label data "updated education dataset (long): 05042018"
datasignature set, reset

saveold ubridge_schools_long.dta, replace

// Close the log opened in the setup section and leave the do-file.
log close
clear
exit